jupyter: python3 title: embedding eda¶

In [1]:
import altair as alt
import os
import umap
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd
import json
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.patches as mpatches


from CryptoFraudDetection.utils import embedding
from CryptoFraudDetection.utils import enums
from CryptoFraudDetection.utils import logger

# Activate Altair's "dark" theme; the Altair charts below are rendered with it.
alt.themes.enable("dark")
Out[1]:
ThemeRegistry.enable('dark')
In [2]:
# Load the processed Reddit posts/comments (path is relative to the notebook)
# and preview the first rows.
df = pd.read_parquet("../data/processed/reddit_posts.parquet")
df.head(5)
Out[2]:
id parent_id author body created depth edited score search_query subreddit title url num_comments
0 yxu5tv <NA> magus-21 Secretly lending customer funds, market-making... 2022-11-17 16:10:14 -1 <NA> 1597 Safe Moon r/CryptoCurrency "DYOR" is worthless. You can't "Do Your Own Re... https://www.reddit.com/r/CryptoCurrency/commen... 634
1 iwqji72 yxu5tv Hank___Scorpio Some peoples research led them to safemoon. 2022-11-17 16:34:34 0 <NA> 291 Safe Moon r/CryptoCurrency <NA> <NA> <NA>
2 iwqnjit iwqji72 getoffthepitch96576 I remember the whitepaper was a power point pr... 2022-11-17 17:01:23 1 <NA> 88 Safe Moon r/CryptoCurrency <NA> <NA> <NA>
3 iwqye4z iwqnjit TheTrueBlueTJ Not to mention the shitfountain that was their... 2022-11-17 18:11:55 2 <NA> 16 Safe Moon r/CryptoCurrency <NA> <NA> <NA>
4 iwsv07b iwqnjit joikhuu Still better than bitconnect's. 2022-11-18 02:20:20 2 <NA> 2 Safe Moon r/CryptoCurrency <NA> <NA> <NA>
In [3]:
# Number of rows (posts and comments together) collected per search query.
df["search_query"].value_counts()
Out[3]:
search_query
Bitcoin       142163
Chainlink      72875
Ethereum       72409
Safe Moon      69582
Cosmos         57082
Avalanche      35673
FTX Token      22597
THORChain      18772
Terra Luna      8984
BitForex        2761
BeerCoin         819
Teddy Doge       738
Name: count, dtype: Int64
In [4]:
# Long-format row counts for every (coin, subreddit) combination. Pivoting
# with unstack() and melting back keeps combinations that never occur as
# explicit zero rows instead of dropping them.
pair_counts = df.groupby(["search_query", "subreddit"]).size()
subreddit_query = (
    pair_counts.unstack()
    .fillna(0)
    .astype(int)
    .reset_index()
    .melt(id_vars="search_query")
)

# Fixed x domain used by both layers so the total labels line up with the bars.
count_domain = [0, 152_000]

# Horizontal bars: one bar per coin, colored by subreddit.
c = (
    alt.Chart(subreddit_query)
    .mark_bar()
    .encode(
        x=alt.X("value:Q").title("Number of Comments").scale(domain=count_domain),
        y=alt.Y("search_query:O").title("Coin"),
        color=alt.Color("subreddit:N").title("Subreddit"),
        tooltip=[
            alt.Tooltip("search_query:O").title("Coin"),
            alt.Tooltip("subreddit:N").title("Subreddit"),
            alt.Tooltip("value:Q").title("# of Comments"),
        ],
    )
    .properties(
        width=800,
        height=400,
        title=alt.Title(text="Number of Comments per Subreddit"),
    )
)

# Text layer that prints the per-coin total next to the end of each bar.
text = (
    alt.Chart(subreddit_query)
    .transform_calculate(customtooltip="datum.value")
    .mark_text(align="left", dx=5, color="white")
    .encode(
        x=alt.X("sum(value):Q").scale(domain=count_domain),
        y=alt.Y("search_query:O"),
        text=alt.Text("sum(value):Q"),
    )
)

c + text
Out[4]:
In [5]:
# Turn the absolute counts into each subreddit's share of its coin's total.
posts_per_coin = subreddit_query.groupby("search_query")["value"].transform("sum")
subreddit_query_normalized = subreddit_query.assign(
    value=subreddit_query["value"] / posts_per_coin
)

# White-to-blue color ramp for the heatmap cells.
blue_ramp = ["#ffffff", "#deecfb", "#bedaf7", "#7ab3ef", "#368ce7", "#1666ba"]

# Same tooltip definition for the rectangles and for the text on top of them.
share_tooltip = [
    alt.Tooltip("search_query:O").title("Coin"),
    alt.Tooltip("subreddit:N").title("Subreddit"),
    alt.Tooltip("value:Q").title(r"% of Comments").format(".4%"),
]

# Heatmap: coins on x, subreddits on y, color encodes the share.
c = (
    alt.Chart(subreddit_query_normalized)
    .mark_rect()
    .encode(
        x=alt.X("search_query:O").title("Coin"),
        y=alt.Y("subreddit:N").title("Subreddit"),
        color=alt.Color("value:Q", legend=None)
        .title("Number of Comments")
        .scale(range=blue_ramp),
        tooltip=share_tooltip,
    )
    .properties(
        width=800,
        height=800,
        title=alt.Title(text="Number of Comments per Subreddit (Normalized by Coin)"),
    )
)

# Percentage labels; white text on cells with a share of at least 50 %,
# black text otherwise.
label_color = alt.condition(
    alt.datum.value >= 0.5, alt.value("white"), alt.value("black")
)
text = (
    alt.Chart(subreddit_query_normalized)
    .transform_calculate(customtooltip="datum.value")
    .mark_text(align="center")
    .encode(
        x=alt.X("search_query:O"),
        y=alt.Y("subreddit:N"),
        text=alt.Text("value:Q").format(".2%"),
        color=label_color,
        tooltip=share_tooltip,
    )
)

c + text
Out[5]:
In [6]:
# Count original posts only (depth == -1) per (coin, subreddit) combination,
# in long format; the unstack/fillna/melt round trip keeps combinations that
# never occur as explicit zero rows.
subreddit_query_posts = (
    df.query("depth == -1")
    .groupby(["search_query", "subreddit"])
    .size()
    .unstack()
    .fillna(0)
    .astype(int)
    .reset_index()
    .melt(id_vars="search_query")
)

# Horizontal bars: one bar per coin, colored by subreddit.
# The axis and tooltip are labelled "Posts" because the frame above was
# filtered to posts; the previous "Comments" labels contradicted the title.
c = (
    alt.Chart(subreddit_query_posts)
    .mark_bar()
    .encode(
        x=alt.X("value:Q", title="Number of Posts", scale={"domain": [0, 1_400]}),
        y=alt.Y("search_query:O", title="Coin"),
        color=alt.Color("subreddit:N", title="Subreddit"),
        tooltip=[
            alt.Tooltip("search_query:O", title="Coin"),
            alt.Tooltip("subreddit:N", title="Subreddit"),
            alt.Tooltip("value:Q", title="# of Posts"),
        ],
    )
    .properties(
        width=800,
        height=400,
        title=alt.Title(text="Number of Posts per Subreddit"),
    )
)


# Text layer that prints the per-coin total next to the end of each bar; it
# uses the same x domain as the bars so both layers stay aligned.
text = (
    alt.Chart(subreddit_query_posts)
    .mark_text(align="left", dx=5, color="white")
    .transform_calculate(customtooltip="datum.value")
    .encode(
        x=alt.X("sum(value):Q", scale={"domain": [0, 1_400]}),
        y=alt.Y("search_query:O"),
        text=alt.Text("sum(value):Q"),
    )
)

c + text
Out[6]:
In [7]:
# Turn the absolute post counts into each subreddit's share of its coin's total.
posts_per_coin = subreddit_query_posts.groupby("search_query")["value"].transform(
    "sum"
)
subreddit_query_posts_normalized = subreddit_query_posts.assign(
    value=subreddit_query_posts["value"] / posts_per_coin
)

# White-to-blue color ramp for the heatmap cells.
blue_ramp = ["#ffffff", "#deecfb", "#bedaf7", "#7ab3ef", "#368ce7", "#1666ba"]

# Same tooltip definition for the rectangles and for the text on top of them.
share_tooltip = [
    alt.Tooltip("search_query:O").title("Coin"),
    alt.Tooltip("subreddit:N").title("Subreddit"),
    alt.Tooltip("value:Q").title(r"% of Posts").format(".4%"),
]

# Heatmap: coins on x, subreddits on y, color encodes the share.
c = (
    alt.Chart(subreddit_query_posts_normalized)
    .mark_rect()
    .encode(
        x=alt.X("search_query:O").title("Coin"),
        y=alt.Y("subreddit:N").title("Subreddit"),
        color=alt.Color("value:Q", legend=None)
        .title("Number of Posts")
        .scale(range=blue_ramp),
        tooltip=share_tooltip,
    )
    .properties(
        width=800,
        height=800,
        title=alt.Title(text="Number of Posts per Subreddit (Normalized by Coin)"),
    )
)

# Percentage labels; white text on cells with a share above 70 %, black
# text otherwise.
label_color = alt.condition(
    alt.datum.value > 0.7, alt.value("white"), alt.value("black")
)
text = (
    alt.Chart(subreddit_query_posts_normalized)
    .transform_calculate(customtooltip="datum.value")
    .mark_text(align="center")
    .encode(
        x=alt.X("search_query:O"),
        y=alt.Y("subreddit:N"),
        text=alt.Text("value:Q").format(".2%"),
        color=label_color,
        tooltip=share_tooltip,
    )
)

c + text
Out[7]:
In [8]:
# Rows per nesting depth, ordered by depth.
depth_df = df["depth"].value_counts().sort_index()

# Build the chart input table once and share it between both layers.
depth_counts = depth_df.reset_index()
# Fixed y domain used by both layers so the labels sit on top of the bars.
count_domain = [0, 240_000]

# One bar per depth level.
c = (
    alt.Chart(depth_counts)
    .mark_bar()
    .encode(
        x=alt.X("depth:O").title("Depth (-1 is the original post)"),
        y=alt.Y("count:Q").title("Number of Comments").scale(domain=count_domain),
        tooltip=[
            alt.Tooltip("depth:O").title("Depth"),
            alt.Tooltip("count:Q").title("# of Comments"),
        ],
    )
    .properties(
        width=800,
        height=400,
        title=alt.Title(text="Number of Comments per Depth"),
    )
)

# Count labels drawn slightly above each bar.
text = (
    alt.Chart(depth_counts)
    .transform_calculate(customtooltip="datum.count")
    .mark_text(align="center", dy=-8, color="white")
    .encode(
        x=alt.X("depth:O"),
        y=alt.Y("count:Q").scale(domain=count_domain),
        text=alt.Text("count:Q"),
    )
)

c + text
Out[8]:
In [9]:
# Rows written per author, then how many authors share each row count.
comments_per_user = df.groupby("author").size()
distribution = comments_per_user.value_counts().sort_index()
distribution_df = distribution.rename_axis("comments_per_user").reset_index(
    name="number_of_users"
)

# Axis ticks in a 1-2-5 progression per decade: 0, 1, 2, 5, 10, ..., 100_000.
tick_values = (
    [0] + [m * 10**e for e in range(5) for m in (1, 2, 5)] + [100_000]
)

# Histogram of activity per user; the x domain is fixed to 1..100 and the
# y axis uses a symlog scale.
c = (
    alt.Chart(distribution_df)
    .mark_bar()
    .encode(
        x=alt.X(
            "comments_per_user:O",
            title="Number of Comments per User",
            scale={"domain": range(1, 101)},
        ),
        y=alt.Y("number_of_users:Q")
        .title("Number of Users")
        .scale(type="symlog", domain=[0, 100_000])
        .axis(values=tick_values),
        tooltip=[
            alt.Tooltip("comments_per_user:Q").title("# of Comments per User"),
            alt.Tooltip("number_of_users:Q").title("# of Users"),
        ],
    )
    .properties(
        width=1000,
        height=400,
        title=alt.Title(text="Number of Comments per User"),
    )
)

c
Out[9]:
In [10]:
# The ten authors with the most rows (posts and comments) in the dataset.
comments_per_user.sort_values(ascending=False).head(10)
Out[10]:
author
None                    39167
donut-bot                8664
AutoModerator            4385
kirtash93                1969
CointestMod              1219
Every_Hunt_160           1005
Objective_Digit           973
goldyluckinblokchain      708
partymsl                  628
coinfeeds-bot             626
dtype: int64
In [11]:
# Original posts (depth == -1) written per author, then how many authors
# share each post count.
posts_per_user = df.query("depth == -1").groupby("author").size()
distribution = posts_per_user.value_counts().sort_index()
distribution_df = distribution.reset_index()
distribution_df.columns = ["posts_per_user", "number_of_users"]

# Histogram of posts per user; the x domain is fixed to 1..100 and the y axis
# uses a symlog scale.
c = (
    alt.Chart(distribution_df)
    .mark_bar()
    .encode(
        x=alt.X(
            "posts_per_user:O",
            title="Number of Posts per User",
            scale={"domain": range(1, 101)},
        ),
        y=alt.Y(
            "number_of_users:Q",
            # The plotted field is the number of users, not the number of
            # posts, so the axis title has to say so.
            title="Number of Users",
            scale=alt.Scale(
                type="symlog",
                domain=[0, 5_000],
                zero=True,
            ),
            axis=alt.Axis(
                values=[
                    0,
                    1,
                    2,
                    5,
                    10,
                    20,
                    50,
                    100,
                    200,
                    500,
                    1_000,
                    2_000,
                    5_000,
                ]
            ),
        ),
        tooltip=[
            alt.Tooltip("posts_per_user:Q", title="# of Posts per User"),
            alt.Tooltip("number_of_users:Q", title="# of Users"),
        ],
    )
    .properties(
        width=1000,
        height=400,
        title=alt.Title(text="Number of Posts per User"),
    )
)

c
Out[11]:
In [12]:
# The ten authors with the most original posts (depth == -1).
posts_per_user.sort_values(ascending=False).head(10)
Out[12]:
author
None                    382
kirtash93                63
goldyluckinblokchain     52
hiorea                   46
InclineDumbbellPress     41
Odd-Radio-8500           33
SigiNwanne               27
partymsl                 23
Ok_Source4689            22
Creative_Ad7831          16
dtype: int64
In [13]:
# Monthly number of original posts per coin from 2020 onwards.
# Filtering to posts first avoids copying the full frame; `created` is
# truncated to the first day of its month so that groupby yields one row per
# (coin, month).
posts_per_coin_per_date = (
    df.query("depth == -1")
    .assign(
        created=lambda posts: pd.to_datetime(posts["created"])
        .dt.to_period("M")
        .dt.to_timestamp()
    )
    .query("created >= '2020-01-01'")
    .groupby(["search_query", "created"])["id"]
    .count()
    .reset_index()
    .rename(columns={"id": "number_of_posts"})
)

# Share of each month in the coin's own total, so every ridge is scaled
# relative to its coin rather than to the most active coin.
posts_per_coin_per_date["number_of_posts_rel"] = posts_per_coin_per_date[
    "number_of_posts"
] / posts_per_coin_per_date.groupby("search_query")["number_of_posts"].transform("sum")

# Ridgeline layout: each facet row is `step` pixels high and its area may
# extend `overlap` row heights upwards into the row above.
step = 40
overlap = 1

c = (
    alt.Chart(posts_per_coin_per_date)
    .mark_area(
        interpolate="monotone",
        fillOpacity=0.5,
        stroke="lightgray",
        strokeWidth=0.5,
    )
    .encode(
        x=alt.X("created:T").title("Date").axis(grid=False),
        # Negative upper range bound lets the area overflow its own row.
        y=alt.Y("number_of_posts_rel:Q")
        .axis(None)
        .scale(range=[step, -step * overlap]),
        color=alt.Color("search_query:N", legend=None),
        tooltip=[
            alt.Tooltip("yearmonth(created):T", title="Date"),
            alt.Tooltip("search_query:N", title="Coin"),
            alt.Tooltip("number_of_posts:Q", title="# of Posts"),
        ],
    )
    .properties(
        width=800,
        height=step,
    )
    .facet(
        row=alt.Row("search_query:N")
        .title(None)
        .header(labelAngle=0, labelAlign="left")
    )
    .properties(
        title=alt.Title(
            text="Number of Posts per Coin per Date after 2020",
            anchor="middle",
        ),
        bounds="flush",
    )
    .configure_facet(spacing=0)
    .configure_view(stroke=None)
    .configure_title(anchor="end")
)

c
Out[13]:
In [14]:
# Load the posts together with their precomputed text embeddings.
# NOTE: this rebinds `df`, replacing the frame loaded at the top of the notebook.
reddit_parquet = "../data/processed/reddit_embedded.parquet"
df = pd.read_parquet(reddit_parquet)
# The message is German for "DataFrame loaded successfully."
print("DataFrame erfolgreich geladen.")
DataFrame erfolgreich geladen.
In [15]:
# Preview the first rows of the frame that includes the embeddings.
df.head(5)
Out[15]:
id parent_id author body created depth edited score search_query subreddit title url num_comments embedded_text
0 yxu5tv <NA> magus-21 Secretly lending customer funds, market-making... 2022-11-17 16:10:14 -1 <NA> 1597 Safe Moon r/CryptoCurrency "DYOR" is worthless. You can't "Do Your Own Re... https://www.reddit.com/r/CryptoCurrency/commen... 634 [-0.28282067, -0.2284037, 0.04313869, 0.076180...
1 iwqji72 yxu5tv Hank___Scorpio Some peoples research led them to safemoon. 2022-11-17 16:34:34 0 <NA> 291 Safe Moon r/CryptoCurrency <NA> <NA> <NA> [-0.4349563, -0.56824857, -0.2638307, 0.357553...
2 iwqnjit iwqji72 getoffthepitch96576 I remember the whitepaper was a power point pr... 2022-11-17 17:01:23 1 <NA> 88 Safe Moon r/CryptoCurrency <NA> <NA> <NA> [-0.3600267, -0.21511783, 0.08681131, 0.751270...
3 iwqye4z iwqnjit TheTrueBlueTJ Not to mention the shitfountain that was their... 2022-11-17 18:11:55 2 <NA> 16 Safe Moon r/CryptoCurrency <NA> <NA> <NA> [-0.077001244, -0.4535337, 0.20802015, 0.74446...
4 iwsv07b iwqnjit joikhuu Still better than bitconnect's. 2022-11-18 02:20:20 2 <NA> 2 Safe Moon r/CryptoCurrency <NA> <NA> <NA> [-0.44820178, -0.32241026, 0.13388954, 0.76202...
In [16]:
# List the available columns of the embedded frame.
df.columns
Out[16]:
Index(['id', 'parent_id', 'author', 'body', 'created', 'depth', 'edited',
       'score', 'search_query', 'subreddit', 'title', 'url', 'num_comments',
       'embedded_text'],
      dtype='object')
In [17]:
# Coins reserved as test coins; they are excluded from this exploration.
coin_test = ['FTX Token', 'Safe Moon', 'Ethereum', 'Cosmos']

# Cut out the test coins.
df = df[~df['search_query'].isin(coin_test)]

# Read the coin metadata. The encoding is given explicitly so that the file
# is decoded as UTF-8 instead of the platform's locale encoding.
with open('../data/raw/coins.json', 'r', encoding='utf-8') as f:
    coins_data = json.load(f)

coins_info_df = pd.DataFrame(coins_data)

# Attach the coin metadata to every post. The left join keeps posts whose
# search query has no matching coin name (their metadata columns are then
# missing). validate="many_to_one" raises if a coin name occurs more than
# once in the metadata, which would otherwise silently duplicate posts.
merged_df = df.merge(
    coins_info_df,
    left_on="search_query",
    right_on="name",
    how="left",
    validate="many_to_one",
)
In [18]:
# Stack the per-post embedding vectors into one 2-D array (one row per post).
embeddings = np.vstack(merged_df["embedded_text"].values)
fraud_labels = merged_df["fraud"]

# Linear projection onto the first two principal components.
pca = PCA(n_components=2)
embeddings_pca = pca.fit_transform(embeddings)

# Non-linear 2-D projection using cosine distance; the seed is fixed so the
# layout is reproducible.
umap_reducer = umap.UMAP(n_components=2, random_state=42, metric='cosine')
embeddings_umap = umap_reducer.fit_transform(embeddings)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Posts without a fraud label (no match in the left merge) are NaN. NaN is
# truthy, so a plain `if fraud` would color them as fraud; show them in gray
# instead.
colors = [
    'gray' if pd.isna(fraud) else ('red' if fraud else 'blue')
    for fraud in fraud_labels
]

axes[0].scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], c=colors, alpha=0.25, edgecolor='k')
axes[0].set_title("PCA of Embeddings")
axes[0].set_xlabel("PCA Component 1")
axes[0].set_ylabel("PCA Component 2")

axes[1].scatter(embeddings_umap[:, 0], embeddings_umap[:, 1], c=colors, alpha=0.15, edgecolor='k')
axes[1].set_title("UMAP of Embeddings")
axes[1].set_xlabel("UMAP Dimension 1")
axes[1].set_ylabel("UMAP Dimension 2")

# Proxy artists for the legend, since the scatter colors are passed per point.
fraud_legend = [
    plt.Line2D([0], [0], marker='o', color='w', label='Fraud', markerfacecolor='red', markersize=10),
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Fraud', markerfacecolor='blue', markersize=10)
]
# Only mention unlabeled posts in the legend when there are any.
if fraud_labels.isna().any():
    fraud_legend.append(
        plt.Line2D([0], [0], marker='o', color='w', label='Unlabeled', markerfacecolor='gray', markersize=10)
    )
fig.legend(handles=fraud_legend, loc="upper right")

plt.tight_layout()
plt.show()
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image

Here we can see the embeddings projected onto the first two PCA components and onto a two-dimensional UMAP embedding. The color indicates whether a post was about a scam or a non-scam coin. Clusters of scam and non-scam embeddings are visible, but it could be that these clusters simply consist of posts about the same coin.

In [19]:
# Assign every coin one color from the tab10 colormap, sampled at i / n.
coin_labels = merged_df["search_query"].values
unique_coins = np.unique(coin_labels)
n_coins = len(unique_coins)
coin_colors = {
    coin_name: plt.cm.tab10(position / n_coins)
    for position, coin_name in enumerate(unique_coins)
}
colors = [coin_colors[label] for label in coin_labels]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Draw the PCA panel (left) and the UMAP panel (right) with the same styling;
# only the projection and the axis wording differ.
panels = (
    (axes[0], embeddings_pca, "PCA", "Component"),
    (axes[1], embeddings_umap, "UMAP", "Dimension"),
)
for panel_ax, projection, method, axis_word in panels:
    panel_ax.scatter(projection[:, 0], projection[:, 1], c=colors, alpha=0.1, edgecolor='k')
    panel_ax.set_title(f"{method} of Embeddings")
    panel_ax.set_xlabel(f"{method} {axis_word} 1")
    panel_ax.set_ylabel(f"{method} {axis_word} 2")

# Proxy artists for the legend, one per coin, since colors are passed per point.
legend_elements = []
for coin_name in unique_coins:
    legend_elements.append(
        plt.Line2D(
            [0],
            [0],
            marker='o',
            color='w',
            label=coin_name,
            markerfacecolor=coin_colors[coin_name],
            markersize=10,
        )
    )
fig.legend(handles=legend_elements, loc="upper right", title="Coins")

plt.tight_layout()
plt.show()
No description has been provided for this image

Here we can see the same plot, but with the color representing the coin. Bitcoin is the most common coin in the dataset and therefore the most visible. It is hard to interpret anything because the Bitcoin posts are so dominant. There may be some clusters, but with this many data points they are hard to see.

In [20]:
# One PCA/UMAP figure per remaining coin, each fitted on that coin's posts only.
# NOTE(review): the loop rebinds `embeddings`, `embeddings_pca` and
# `embeddings_umap`, which the earlier all-coins cells also use; re-running
# those cells after this one would plot the last coin's projection.
unique_keywords = df['search_query'].unique()

for keyword in unique_keywords:
    is_keyword = df['search_query'] == keyword
    keyword_df = df[is_keyword]

    # Stack this coin's embedding vectors into one 2-D array.
    embeddings = np.vstack(keyword_df['embedded_text'].values)

    pca = PCA(n_components=2)
    embeddings_pca = pca.fit_transform(embeddings)

    # Same UMAP settings as for the all-coins plot (cosine metric, fixed seed).
    umap_reducer = umap.UMAP(n_components=2, metric='cosine', random_state=42)
    embeddings_umap = umap_reducer.fit_transform(embeddings)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: PCA, right panel: UMAP; identical styling for both.
    panels = (
        (axes[0], embeddings_pca, "PCA", "Component"),
        (axes[1], embeddings_umap, "UMAP", "Dimension"),
    )
    for panel_ax, projection, method, axis_word in panels:
        panel_ax.scatter(projection[:, 0], projection[:, 1], alpha=0.7, edgecolor='k')
        panel_ax.set_title(f"{method} of Embeddings for {keyword}")
        panel_ax.set_xlabel(f"{method} {axis_word} 1")
        panel_ax.set_ylabel(f"{method} {axis_word} 2")

    plt.tight_layout()
    plt.show()
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
No description has been provided for this image

Above we can see the same plots for every coin by itself. There is no specific indication that some clusters tend to belong to scams or non-scams. For example, the plots for Avalanche (no scam) and BitForex (scam) look similar.